Alexander Feldman V.1.2
This project investigates user behavior for the company's app.
The goals of the project:
#!pip install seaborn --upgrade
#!pip install plotly --upgrade
# import libraries
import math
import pandas as pd
import numpy as np
import scipy.stats as st
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly import graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
# Open the event log. Try the platform path first; if the file is not
# there, fall back to the local path. Only a missing file should trigger
# the fallback — a bare `except` would also hide parse errors.
try:
    data = pd.read_csv('/datasets/logs_exp_us.csv', sep='\t')  # path for working on the platform
except FileNotFoundError:
    data = pd.read_csv('datasets/logs_exp_us.csv', sep='\t')  # path for local working
data.head()
display(data.info(), data.describe())
print('Dataframe has', len(data[data.duplicated()]), 'duplicated values')
No missing values. There are 413 duplicate rows. We should correct the data types of some columns.
# rename columns
data.columns = ['event', 'user_id', 'datetime', 'group']

# Handle duplicate rows
data = data.drop_duplicates()


def _users_in_both(g1, g2):
    """Return rows of group `g1` whose user_id also appears in group `g2`.

    Reads the module-level `data` frame; used only for the one-off sanity
    check that no user was assigned to two experiment groups.
    """
    rows_g1 = data[data['group'] == g1]
    return rows_g1[rows_g1['user_id'].isin(data[data['group'] == g2]['user_id'])]


# check whether users are in only one group
both_246_247 = _users_in_both(246, 247)
both_246_248 = _users_in_both(246, 248)
both_247_248 = _users_in_both(247, 248)
print('Are there users who is in 246 and 247 groups?', len(both_246_247) > 0)
print('Are there users who is in 246 and 248 groups?', len(both_246_248) > 0)
print('Are there users who is in 247 and 248 groups?', len(both_247_248) > 0)
# Separate datetime column and correct the column dtypes.
# Timestamps in the raw log are Unix epoch seconds.
data['datetime'] = pd.to_datetime(data['datetime'], unit='s')
# astype('datetime64[D]') is rejected by pandas >= 2.0 ("casting to
# unit-less/day dtype is not supported"); dt.normalize() floors every
# timestamp to midnight, which is the intended per-day date column.
data['date'] = data['datetime'].dt.normalize()
data['group'] = data['group'].astype('str')
The DataFrame is ready for analysis.
# Aggregate the log into a per-event count and visualise it as a bar chart.
events_number = (
    data.groupby(['event'], as_index=False)['user_id']
        .count()
        .rename(columns={'user_id': 'count_events'})
)
fig = px.bar(
    events_number,
    x="event",
    y="count_events",
    text="count_events",
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.update_traces(texttemplate='%{text}', textposition='auto')
fig.update_layout(
    yaxis=dict(title='Number of events'),
    xaxis=dict(title='Type of events'),
    title={'text': 'Number of events by groups', 'x': 0.5},
)
fig.show()
As you can see from the graph, there are 119101 events on the MainScreen. CartScreen and OffersScreen have about 46808 and 42668 events, respectively, PaymentScreen has 34118 events, and Tutorial only 1018.
# Unique users per experiment group, shown as a pie chart.
users_number = data.groupby('group', as_index=False).agg({'user_id': 'nunique'})
fig = px.pie(
    users_number,
    values='user_id',
    names='group',
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.update_traces(textinfo='value + percent')
fig.update_layout(
    legend_title_text='Group',
    title={'text': 'Number of unique users by group', 'x': 0.5},
)
fig.show()
The groups are about the same size. The difference between the shares of groups is less than 0.7% of the total number of users.
# Count events per user within each group and look at the distribution.
event_per_user = (
    data.groupby(['group', 'user_id'], as_index=False).agg({'event': 'count'})
)
# Violin plot (with inner box and raw points) of per-user activity by group.
fig = px.violin(
    event_per_user,
    y="event",
    x="group",
    box=True,
    points="all",
    color='group',
    title='The distribution of the number of events by users by group',
    color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.show()
print('The average number of events by user is {:.2f}'.format(event_per_user['event'].mean()))
print('The median of number of events by user is {:.2f}'.format(event_per_user['event'].median()))
As we can see from the graph, there are some abnormal users who have a huge number of events. These are outliers. Drop the outlier users who have more than 300 events.
# Users with more than 300 events are treated as outliers and removed.
abnormal_user_list = event_per_user.loc[event_per_user['event'] > 300, 'user_id']
keep_mask = ~data['user_id'].isin(abnormal_user_list)
data = data[keep_mask].reset_index(drop=True)
users_number_2 = data.groupby('group', as_index=False).agg({'user_id': 'nunique'})
period_days = (data['date'].max() - data['date'].min()).days
print('The studed period of time is {} days: from {:%Y-%m-%d} to {:%Y-%m-%d}'.format(
    period_days, data['date'].min(), data['date'].max()))
# Histogram of raw timestamps to see when the events were actually logged.
fig = px.histogram(
    data,
    x="datetime",
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.update_layout(title={'text': 'Distribution of events by date and time', 'x': 0.5})
fig.show()
As we can see from the histogram the real time period is from 1 aug 2019 to 7 aug 2019.
Such technical data problems should not prevent us from analyzing, provided that the data of some users is not confused with the data of other users.
# Keep only the reliable stretch of data that starts on 2019-08-01.
data = data.loc[data['date'] >= '2019-08-01'].reset_index(drop=True)
new_period_days = (data['date'].max() - data['date'].min()).days
print('The new period of time is {} days: from {:%Y-%m-%d} to {:%Y-%m-%d}'.format(
    new_period_days, data['date'].min(), data['date'].max()))
# Compare event counts before and after cleaning.
# `events_number` already holds exactly one row per event, so the original
# groupby(...).sum() over it was a no-op re-aggregation; a plain copy is
# the "before" snapshot.
events_number_before = events_number.copy()
events_number_after = (
    data.groupby('event', as_index=False)['user_id']
        .count()
        .rename(columns={'user_id': 'count_events'})
)
# Grouped bar chart: before vs after, per event type.
fig = go.Figure()
fig.add_trace(go.Bar(
    x=events_number_before['event'],
    y=events_number_before['count_events'],
    name='Before', text=events_number_before['count_events'],
))
fig.add_trace(go.Bar(
    x=events_number_after['event'],
    y=events_number_after['count_events'],
    name='After', text=events_number_after['count_events'],
))
fig.update_traces(texttemplate='%{text}', textposition='auto')
fig.update_layout(yaxis=dict(title='Number of events'), xaxis=dict(title='Type of events'),
                  title={'text': 'Number of events before and after handle', 'x': 0.5})
fig.show()
As we can see from the graph, the number of events decreased for all event types. The largest losses are for the CartScreen (-24.5%) and PaymentScreen (-28%) events.
# Compare group sizes at each cleaning step with three side-by-side pies:
# raw data, after dropping abnormal users, after dropping abnormal days.
users_number_3 = data.groupby('group', as_index=False).agg({'user_id': 'nunique'})
snapshots = [users_number, users_number_2, users_number_3]
fig = make_subplots(rows=1, cols=3,
                    specs=[[{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}]])
for col, snapshot in enumerate(snapshots, start=1):
    fig.add_trace(go.Pie(labels=snapshot['group'], values=snapshot['user_id']), 1, col)
fig.update_traces(hole=.1, textinfo='value + percent')
fig.update_layout(
    title_text="How many users did we lose for a handle?",
    annotations=[dict(text='Raw data', x=0.1, y=0.9, font_size=15, showarrow=False),
                 dict(text='Drop abnormal users', x=0.5, y=0.9, font_size=15, showarrow=False),
                 dict(text='Drop abnormal days', x=0.95, y=0.9, font_size=15, showarrow=False)])
fig.show()
The loss of users is less than 1%. The proportions of the groups almost didn't change.
# KDE of event timestamps, one curve per event type.
plt.figure(figsize=(18, 6))
sns.kdeplot(data=data, x='datetime', hue='event', fill=True)
plt.title('The frequency of events by types', fontdict={'size': 15})
plt.xlabel('Time')
# Soften the axes frame: hide top/right spines, fade bottom/left.
ax = plt.gca()
for side, alpha in (("top", 0.0), ("bottom", 0.3), ("right", 0.0), ("left", 0.3)):
    ax.spines[side].set_alpha(alpha)
plt.show()
The MainScreen event has the highest frequency, followed by OffersScreen, CartScreen, and PaymentScreen. The Tutorial event has the lowest frequency.
It is noteworthy that on weekdays the number of visits to the MainScreen is noticeably higher than on weekends (3/08, 4/08).
# Unique users per event, shown as a sorted bar chart and a pie chart.
event_users = (
    data.groupby('event', as_index=False)
        .agg({'user_id': 'nunique'})
        .rename(columns={'user_id': 'n_users'})
        .sort_values('n_users', ascending=False)
)
fig = px.bar(
    event_users, x="event", y="n_users", text="n_users", color='event',
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.update_layout(yaxis=dict(title='Number of users'), xaxis=dict(title='Type of events'),
                  title={'text': 'Number of users by events', 'x': 0.5})
fig.show()
fig = px.pie(
    event_users, values='n_users', names='event',
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.update_traces(textinfo='value + percent')
fig.update_layout(legend_title_text='Events',
                  title={'text': ' The proportion of users who performed the action at least once', 'x': 0.5})
fig.show()
Almost 40% (7387) of users visited the MainScreen, 23% (4561) the OffersScreen, and 18.5% (3702) and 17.5% (3507) of users visited the CartScreen and PaymentScreen respectively. Only 4% (835) opened the Tutorial.
The logic dictates that the order of events in the funnel should look like this:
Tutorial does not participate in the funnel because users can navigate to Tutorial from any stage.
Let's check the order in which users pass through the funnel. Are there any abnormal paths?
# Funnel stages, ordered from first screen to payment.
stage = ['MainScreenAppear', 'OffersScreenAppear', 'CartScreenAppear', 'PaymentScreenSuccessful']
# For every adjacent pair of stages, count users seen at the later stage
# who never appeared at the earlier one. The original recomputed the same
# stage filter twice per iteration; it is hoisted here, and the loop bound
# follows len(stage) instead of a hard-coded 4.
for i in range(1, len(stage)):
    current = data[data['event'] == stage[i]]
    previous_users = data[data['event'] == stage[i - 1]]['user_id']
    check_stage = current[~current['user_id'].isin(previous_users)]
    n_user = check_stage['user_id'].nunique()
    print('{} users from the stage {} have never been on the stage {}'.format(n_user, stage[i], stage[i-1]))
As we can see, there are several abnormal users in terms of funnels.
I guess this is an echo of technical problems with data noticed above, when older events were reflected as newer.
This should not affect the results of the analysis, since we ourselves will correctly distribute events in the funnel.
However, we will share our findings with the developers.
# Build the funnel from per-event user counts. Tutorial is excluded:
# users can open it from any stage, so it is not a funnel step.
funnel_users = event_users[event_users['event'] != 'Tutorial']
funnel_colors = ["rgb(128,177,211)", "rgb(253,180,98)",
                 "rgb(179,222,105)", "rgb(217,217,217)"]
fig = go.Figure(go.Funnel(
    y=funnel_users['event'],
    x=funnel_users['n_users'],
    textinfo='percent previous + value',
    marker={"color": funnel_colors},
))
fig.update_layout(title={'text': 'Events funnel with share of users from the previous stage', 'x': 0.5},
                  yaxis=dict(title='Events steps'))
fig.show()
As we can see from the funnel graph, we lose the most users at the OffersScreen stage (62% get through). The most successful stage is PaymentScreen (95%). The last number tells us that the payment step is implemented very successfully.
This values show us also churn rate of the funnel (as the share of previous - 1): 0% -> -38% -> -19% -> -5% respectively.
The biggest concern is the transition from the MainScreen to the OffersScreen. Only 62% of users go through this stage. To improve the conversion of this step, we can try redesigning the MainScreen and improving the links to the OffersScreen. Next, it's worth testing the hypothesis that the new design will improve the conversion of this stage.
# How many users completed all four funnel steps (Tutorial excluded)?
user_events = (
    data[data['event'] != 'Tutorial']
    .groupby('user_id', as_index=False)
    .agg({'event': 'nunique'})
)
user_full_funnel = user_events[user_events['event'] == 4]
remain_users = user_events[user_events['event'] < 4]
# Pie chart: full-journey users vs everyone else.
fig = go.Figure(data=[go.Pie(
    labels=['The entire journey users', 'The remain users'],
    values=[len(user_full_funnel), len(remain_users)],
)])
fig.update_traces(textinfo='value + percent', hole=.2)
fig.update_layout(legend_title_text='Events',
                  title={'text': 'Share of users make the entire journey via a funnel', 'x': 0.5})
fig.show()
45% of users made the entire journey through the funnel. In fact, this is the conversion rate, and it's a great value for CR.
# Re-count unique users per group on the cleaned data before testing.
users_number = data.groupby('group', as_index=False).agg(user_id=('user_id', 'nunique'))
fig = px.pie(
    users_number,
    values='user_id',
    names='group',
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.update_traces(textinfo='value + percent')
fig.update_layout(legend_title_text='Group',
                  title={'text': 'Number of unique users by group', 'x': 0.5})
fig.show()
We have groups that are almost similar in the number of users and proportions.
# Group data
# Pivot: one row per event, one column per group name ('246', '247',
# '248'), values = number of unique users who triggered the event in
# that group.
# NOTE(review): check_hypothesis below indexes this frame by the
# group-name columns, so its exact shape and column names matter.
pivot = data.pivot_table(index='event', values='user_id', columns='group', aggfunc='nunique').reset_index()
pivot
Since we have multiple testing we need to correct up our alpha level. Let's apply the Holm method of correction. For this, we will calculate the alpha level for each test depending on the number of tests, and depending on which test it is.
Totally we will do 20 tests.
base alpha level = 0.05
# make a test counter
m = 20 + 1  # add +1 for a technical reason: each loop decrements before testing


def check_hypothesis(group1, group2, event, alpha):
    """Two-proportion z-test on the share of users who triggered `event`.

    Compares the conversion of `group1` against `group2`. A group may be a
    single id ('246') or a combined one joined with '_' ('246_247'); the
    original version relied on fixed 3-character slices (`[:3]`/`[-3:]`),
    which only worked for 3-digit ids — splitting on '_' generalizes it.

    Relies on module-level state: `pivot` (unique users per event/group),
    `data` (event log) and `m` (remaining-test counter used to tighten
    alpha, Holm-style step-down correction).

    Prints the corrected alpha, the p-value and the test verdict.
    """
    alpha = alpha / m  # correction for alpha: stricter while many tests remain

    def _successes_and_trials(group):
        # Sum unique-user counts over the (possibly combined) group.
        parts = group.split('_')
        successes = sum(pivot[pivot['event'] == event][g].iloc[0] for g in parts)
        trials = sum(data[data['group'] == g]['user_id'].nunique() for g in parts)
        return successes, trials

    successes1, trials1 = _successes_and_trials(group1)
    successes2, trials2 = _successes_and_trials(group2)

    # proportion for success in the first group
    p1 = successes1 / trials1
    # proportion for success in the second group
    p2 = successes2 / trials2
    # proportion in a combined dataset
    p_combined = (successes1 + successes2) / (trials1 + trials2)
    difference = p1 - p2
    z_value = difference / math.sqrt(p_combined * (1 - p_combined) * (1 / trials1 + 1 / trials2))
    distr = st.norm(0, 1)
    # Two-sided p-value from the standard normal distribution.
    p_value = (1 - distr.cdf(abs(z_value))) * 2

    print('The corrected alpha: ', alpha)
    print('p-value: ', p_value)
    if p_value < alpha:
        print("We reject the null hypothesis for", event, 'for groups', group1, 'and', group2)
    else:
        print("We can't reject the null hypothesis for", event, 'for groups', group1, 'and', group2)
# A/A test: control groups 246 vs 247, one test per event type.
for ev in pivot['event'].unique():
    m -= 1  # one fewer remaining test -> tighter alpha correction
    check_hypothesis('246', '247', ev, alpha=0.05)
As we can see, the A/A test was successful. The groups are equal for all events.
# A/B test #1: control group 246 vs experimental group 248.
for ev in pivot['event'].unique():
    m -= 1  # one fewer remaining test -> tighter alpha correction
    check_hypothesis('246', '248', ev, alpha=0.05)
As we can see, A/B test #1 was successful. The groups are equal for all events.
# A/B test #2: control group 247 vs experimental group 248.
for ev in pivot['event'].unique():
    m -= 1  # one fewer remaining test -> tighter alpha correction
    check_hypothesis('247', '248', ev, alpha=0.05)
As we can see, A/B test #2 was successful. The groups are equal for all events.
# A/B test #3: both control groups combined (246 + 247) vs group 248.
for ev in pivot['event'].unique():
    m -= 1  # one fewer remaining test -> tighter alpha correction
    check_hypothesis('246_247', '248', ev, alpha=0.05)
As we can see, the combined A/B test #3 was successful. The groups are equal for all events.
During testing, we applied an alpha correction using the Holm method, so we took into account the likelihood of testing errors.
We had the following goals for the project:
We successfully built and studied the sales funnel and ran A/A/B tests for applying a new font to the design of the app.